# --- Run configuration --------------------------------------------------
# sampleID is used in every plot title and output file name below
sampleID <- "all" #a name for your sample, should be the same as the "assay = assayID" in the box below
outdir <- "/Volumes/STNNHPC-Q1139/Laura/Covid19_ST/2_output/20210409_PreliminaryPipeline/outdir_alltissues/" #where to store the output (must end with "/")
# load the demultiplexed spatial Seurat object produced by the earlier step
df <- readRDS("/Volumes/STNNHPC-Q1139/Laura/Covid19_ST/2_output/20210409_Demultiplex/outdir/covid.RDS")
library(Seurat)
library(ggplot2)
library(dplyr)
library(clustree)
library(scran)
library(scater)
library(PCAtools)
library(tibble)
library(SingleCellExperiment)
to.pdf <- function(expr, filename, ...) {
  # Evaluate a plotting expression in the caller's environment and save the
  # result to a PDF.
  #
  # Args:
  #   expr:     a plotting expression -- either a base-graphics call (draws as
  #             a side effect) or an object such as a ggplot that must be
  #             print()-ed to render.
  #   filename: path of the PDF to create.
  #   ...:      further arguments forwarded to pdf() (e.g. width, height).
  #
  # Returns: the value of `expr`, invisibly.
  pdf(filename, ...)
  on.exit(dev.off())  # close the device even if `expr` errors
  result <- eval.parent(substitute(expr))
  # Only print non-NULL results: ggplot/lattice objects need printing to
  # render, but base-graphics calls return NULL, and printing that NULL
  # spammed the console (the stray "NULL" seen in earlier run logs).
  if (!is.null(result)) {
    print(result)
  }
  invisible(result)
}
# function to remove cells with high mitochondrial/ribosomal percentages
func_assessMT.RT <- function(seuratObj, sampleID, mt.threshold = 50, rb.threshold = 50) {
  # USAGE: seuratObj <- func_assessMT.RT(seuratObj, sampleID)
  # OUTPUT: a Seurat object with percent.mt / percent.rb metadata columns added
  #         and spots at/above the thresholds removed.
  # ARGS:
  #   seuratObj    - spatial Seurat object (assay "Spatial")
  #   sampleID     - label used in plot titles and output file names
  #   mt.threshold - keep spots with percent.mt strictly below this (default 50,
  #                  matching the previously hard-coded filter)
  #   rb.threshold - keep spots with percent.rb strictly below this (default 50)
  # NOTE: writes diagnostic PDFs via the file-level `to.pdf` helper into the
  #       file-level `outdir` -- both must exist in the calling environment.
  # find percentage of mt/rb genes per spot
  # Human-specific. For mouse, change to "^mt-" (mitochondria) or "^Rps|^Rpl" (ribosomes)
  seuratObj[["percent.mt"]] <- PercentageFeatureSet(seuratObj, pattern = "^MT-")
  seuratObj[["percent.rb"]] <- PercentageFeatureSet(seuratObj, pattern = "^RPS|^RPL")
  # define some graph functions which will be run with `to.pdf` later
  # (FIX: theme_bw() is now applied BEFORE the theme() tweaks -- complete
  # themes reset all elements, so the original order silently discarded the
  # bold-title and hidden-legend settings)
  ## spatial map of mitochondrial expression
  fig.mitochondrialPercentage.spatial <- function() {
    SpatialFeaturePlot(seuratObj, features = "percent.mt") +
      ggtitle(paste0(sampleID, " - percentage mitochondrial genes per spot")) +
      theme(legend.position = "right")
  }
  ## mitochondrial % only (dashed 20% line is a visual guide, not the filter)
  fig.mitochondriaPercentage <- function() {
    VlnPlot(seuratObj, features = c("percent.mt")) +
      ggtitle(paste0(sampleID, " - percentage mitochondrial genes per cell")) +
      ylab("% mitochondrial genes") +
      geom_hline(yintercept = 20, linetype = "dashed", color = "blue") +
      theme_bw() +
      theme(plot.title = element_text(size = 12, face = "bold")) +
      theme(legend.position = "none")
  }
  ## spatial map of ribosomal expression
  fig.ribosomePercentage.spatial <- function() {
    SpatialFeaturePlot(seuratObj, features = "percent.rb") +
      ggtitle(paste0(sampleID, " - percentage ribosomal genes per spot")) +
      theme(legend.position = "right")
  }
  ## ribosomal % only
  fig.ribosomePercentage <- function() {
    VlnPlot(seuratObj, features = c("percent.rb")) +
      ggtitle(paste0(sampleID, " - percentage ribosomal genes per cell")) +
      ylab("% ribosomal genes") +
      geom_hline(yintercept = 50, linetype = "dashed", color = "blue") +
      theme_bw() +
      theme(plot.title = element_text(size = 12, face = "bold")) +
      theme(legend.position = "none")
  }
  ## mitochondria % vs nFeatures
  fig.mitochondriaVSfeatures <- function() {
    basicplot <- FeatureScatter(seuratObj, feature1 = "nFeature_Spatial", feature2 = "percent.mt", pt.size = 0.5)
    pearson <- basicplot$labels$title  # FeatureScatter titles the plot with the Pearson correlation
    basicplot +
      ggtitle(paste0(sampleID, " - percentage mitochondrial genes vs number of genes per cell (pearson = ", pearson, ")")) +
      ylab("% mitochondrial genes") +
      xlab("number of genes") +
      geom_hline(yintercept = c(10, 20), linetype="dashed", color = "blue") +
      geom_vline(xintercept = c(3000), linetype="dashed", color = "blue") +
      theme_bw() +
      theme(plot.title = element_text(size = 12, face = "bold")) +
      theme(legend.position = "none")
  }
  ## ribosomal % vs nFeatures
  fig.ribosomeVSfeatures <- function() {
    basicplot <- FeatureScatter(seuratObj, feature1 = "nFeature_Spatial", feature2 = "percent.rb", pt.size = 0.5)
    pearson <- basicplot$labels$title
    basicplot +
      ggtitle(paste0(sampleID, " - percentage ribosomal genes vs number of genes per cell (pearson = ", pearson, ")")) +
      ylab("% ribosomal genes") +
      xlab("number of genes") +
      geom_hline(yintercept = 50, linetype="dashed", color = "blue") +
      geom_vline(xintercept = c(3000), linetype="dashed", color = "blue") +
      theme_bw() +
      theme(plot.title = element_text(size = 12, face = "bold")) +
      theme(legend.position = "none")
  }
  # Run the figure functions and save graphs as PDFs
  ## to view a figure interactively, just call it, e.g. `fig.mitochondriaPercentage()`
  to.pdf(fig.mitochondriaPercentage(), paste0(outdir, sampleID, "_percentMitochondria_unfiltered.pdf"))
  to.pdf(fig.mitochondrialPercentage.spatial(), paste0(outdir, sampleID, "_percentMitochondriaSpatial_unfiltered.pdf"))
  to.pdf(fig.ribosomePercentage(), paste0(outdir, sampleID, "_percentRibosome_unfiltered.pdf"))
  to.pdf(fig.ribosomePercentage.spatial(), paste0(outdir, sampleID, "_percentRibosomeSpatial_unfiltered.pdf"))
  to.pdf(fig.mitochondriaVSfeatures(), paste0(outdir, sampleID, "_percentMitochondriaVsnFeatures.pdf"))
  to.pdf(fig.ribosomeVSfeatures(), paste0(outdir, sampleID, "_percentRibosomesVsnFeatures.pdf"))
  # keep only spots strictly below both thresholds
  # (previously hard-coded at 50/50; the defaults preserve that behaviour)
  seuratObj <- subset(seuratObj, subset = percent.mt < mt.threshold & percent.rb < rb.threshold)
  # return the filtered Seurat object to the caller
  return(seuratObj)
}
# ------------------------------------------------------------------
# QC - CELL CYCLE
# ------------------------------------------------------------------
func_predictCellCycle <- function(seuratObj, myspecies="mouse"){
  # USAGE: seuratObj <- func_predictCellCycle(seuratObj, "mouse")
  # OUTPUT: a Seurat object with S/G2M-phase scores and cell stage (G1, S, G2M) calls
  # NOTE: relies on the file-level `sampleID`, `outdir` and `to.pdf` objects
  #       existing in the calling environment.
  # specify the gene set used for Cell Cycle Scoring (human or mouse)
  if (identical(myspecies, "mouse")) {
    # mouse orthologues of Seurat's cell-cycle gene list, prepared separately
    load("/Volumes/STNNHPC-Q1139/Laura/Covid19_ST/1_code/mouse.cc.genes.Rdata")
    geneset <- mouse.cc.genes
  } else if (identical(myspecies, "human")) {
    geneset <- cc.genes.updated.2019  # shipped with Seurat
  } else {
    stop("The 'species' argument must be mouse or human")
  }
  # quick-and-dirty LogNormalize purely for the phase scoring; the
  # analysis-grade normalisation is done later with scran
  seuratObj <- NormalizeData(seuratObj,
                             normalization.method = "LogNormalize",
                             scale.factor = 10000)
  seuratObj <- CellCycleScoring(seuratObj,
                                s.features = geneset$s.genes,
                                g2m.features = geneset$g2m.genes,
                                set.ident = TRUE)
  # define some graph functions which will be run with `to.pdf` later
  fig.cellcycle.bar <- function() {
    # y-axis limit rounded to the nearest 1000
    # NOTE(review): round(x, -3) gives 0 when the largest phase has <500 spots,
    # collapsing the axis -- adjust if plotting small samples
    myscale <- round(max(table(seuratObj$Phase)), -3)
    mybar <- barplot(table(seuratObj$Phase),
                     ylim = (c(0, myscale)),
                     main = paste0("Cell Phases in ", sampleID),
                     xlab = "cell phase",
                     ylab = "# cells",
                     col = "white")
    # print the count just above each bar
    text(mybar,
         table(seuratObj$Phase)+100,
         paste("n: ", table(seuratObj$Phase), sep=""), cex = 1)
  }
  fig.cellcycle.pie <- function() {
    phase.counts <- table(seuratObj$Phase)
    mycolours <- c("bisque", "cornflowerblue", "cadetblue2")[seq_along(phase.counts)]
    pie(phase.counts,
        labels = phase.counts,
        col = mycolours,
        main = paste0("Cell phases in ", sampleID))
    # FIX: legend labels/colours are taken from the data rather than the
    # hard-coded c("G1", "G2M", "S") -- the hard-coded version mislabels the
    # wedges whenever a phase is absent from the predictions
    legend("topright", names(phase.counts), cex = 0.8, fill = mycolours)
  }
  # spatial map of the predicted phase per spot
  fig.cellcycle.spatial <- function() {
    SpatialDimPlot(seuratObj, group.by = "Phase") +
      theme(legend.position = "right")
  }
  # Run the figure functions and save graphs as PDFs
  to.pdf(fig.cellcycle.bar(), paste0(outdir, sampleID, "_CellCycle_bar.pdf"))
  to.pdf(fig.cellcycle.pie(), paste0(outdir, sampleID, "_CellCycle_pie.pdf"))
  to.pdf(fig.cellcycle.spatial(), paste0(outdir, sampleID, "_CellCycle_spatial.pdf"))
  # return the updated Seurat object
  return(seuratObj)
}
# ------------------------------------------------------------------
# NORMALISATION
# ------------------------------------------------------------------
# function to normalise count data in scran/scater
func_scranNorm <- function(seuratObj) {
  # USAGE: mySCE <- func_scranNorm(seuratObj)
  # OUTPUT: a SingleCellExperiment with two extra assays:
  #         "unlog.normcounts" (size-factor-normalised counts) and
  #         "ln.normcounts"    (natural log of the former, Seurat-compatible)
  # NOTE: scran normally produces log2 counts; we produce natural-log counts
  #       because Seurat's "data" slot is natural-log.
  # convert to SCE object (this drops the spatial image data, which is why the
  # caller copies the matrices back into the original Seurat object afterwards)
  mySCE <- as.SingleCellExperiment(seuratObj)
  # pre-cluster so size factors are deconvolved within roughly homogeneous groups
  scranclusters <- quickCluster(mySCE)
  mySCE <- computeSumFactors(mySCE, clusters = scranclusters)
  # scran can calculate negative or zero size factors which would completely
  # distort the normalised expression matrix. FIX: the check is now <= 0 so it
  # matches the warning text (the original `< 0` missed the zero case).
  minsizefactor <- min(sizeFactors(mySCE))
  if (minsizefactor <= 0) {
    warning("ALERT! scran normalisation has produced negative or zero size factors which will distort the normalised expression matrix. Proceed with care!\n You can try increasing the cluster and pool sizes until they are all positive\n See https://biocellgen-public.svi.edu.au/mig_2019_scrnaseq-workshop/public/normalization-confounders-and-batch-correction.html")
  }
  mySCE <- scater::logNormCounts(mySCE, log = FALSE, name = "unlog.normcounts")
  # natural-log transform counts and convert back to sparse matrix format
  assay(mySCE, "ln.normcounts") <- as(log(x = assay(mySCE, "unlog.normcounts") + 1), "dgCMatrix")
  return(mySCE)
  # NOTE: To convert to Seurat object from now on you must run:
  # seuratObj <- as.Seurat(mySCE, counts = "counts", data = "ln.normcounts")
}
func_ScaleData <- function(seuratObj) {
  # USAGE: seuratObj <- func_ScaleData(seuratObj)
  # OUTPUT: a Seurat object with the 2000 most variable features identified
  #         (vst method) and the normalised counts centred/scaled, ready for PCA.
  # NOTE: returns a Seurat object; if an SCE is needed downstream, convert it
  #       with as.SingleCellExperiment() at the call site.
  with.hvg <- FindVariableFeatures(seuratObj, selection.method = "vst", nfeatures = 2000)
  ScaleData(with.hvg)
}
# ------------------------------------------------------------------
# PCA
# ------------------------------------------------------------------
# function to find variable features and scale data using Seurat
func_runPCA <- function(seuratObj, runJackstraw = "TRUE") {
  # USAGE: seuratObj <- func_runPCA(seuratObj, runJackstraw = TRUE or FALSE)
  # OUTPUT: a Seurat object with a 50-PC PCA run (plus JackStraw scores when requested)
  # NOTE: writes scree / cumulative-variance / elbow (and optionally JackStraw)
  #       PDFs via the file-level `to.pdf`, `outdir` and `sampleID` objects.
  # FIX: accept logical TRUE/FALSE as well as the legacy "TRUE"/"FALSE" strings,
  # and fail fast on anything else (before the expensive PCA run)
  if (isTRUE(runJackstraw) || identical(runJackstraw, "TRUE")) {
    do.jackstraw <- TRUE
  } else if (isFALSE(runJackstraw) || identical(runJackstraw, "FALSE")) {
    do.jackstraw <- FALSE
  } else {
    stop("runJackstraw must be TRUE or FALSE")
  }
  # Run PCA
  seuratObj <- RunPCA(seuratObj, features = VariableFeatures(object = seuratObj), npcs = 50)
  # calculate variance explained by each PC (eigenvalue / total variance)
  total_variance <- seuratObj@reductions$pca@misc$total.variance
  eigValues <- (seuratObj[["pca"]]@stdev)^2
  varExplained <- eigValues / total_variance
  varExplained.cum <- cumsum(varExplained)
  ### how many PCs before 20 % of the variance is explained?
  var.20pc <- sum(varExplained.cum <= 0.2)
  ### how much variance do 50 PCs explain?
  varpc.50PCA <- 100*(varExplained.cum[50])
  print(paste0("The first 50 PCs explain ", round(varpc.50PCA), "% of the variance. 20% of the variance is explained by the first ", var.20pc, " PCs"))
  # define some graph functions which will be run with `to.pdf` later
  ## scree plot
  fig.scree <- function() {
    varExplained %>% enframe(name = "PC", value = "varExplained" ) %>%
      ggplot(aes(x = PC, y = varExplained)) +
      theme_bw() +
      geom_bar(stat = "identity") +
      theme_classic() +
      ggtitle(paste0(sampleID, ": scree plot")) +
      ylab("explained variance")
  }
  ## cumulative variance, with guides at 20 PCs / 20% variance
  fig.cumulativeVar <- function() {
    ggplot(as.data.frame(varExplained.cum), aes(y = varExplained.cum, x = seq(1, length(varExplained.cum)))) +
      geom_point(size = 1) +
      theme_bw() +
      ggtitle("cumulative variance explained by increasing PCs") +
      xlab("PCs") +
      ylab("cumulative explained variance") +
      geom_hline(yintercept = c(0.2), linetype = "dashed", color = "blue") +
      geom_vline(xintercept = c(20), linetype = "dashed", color = "blue")
  }
  # Elbow plot (adapted from Seurat's ElbowPlot() but showing all tested PCs)
  fig.elbow <- function() {
    ElbowPlot(seuratObj, ndims = 50, reduction = "pca") +
      theme_bw() +
      ggtitle(paste0(sampleID, ": elbow plot of standard deviations of principal components"))
  }
  # Perform JackStraw analysis (slow: 100 permutations over 50 PCs)
  if (do.jackstraw) {
    seuratObj <- JackStraw(seuratObj, num.replicate = 100, dims = 50)
    seuratObj <- ScoreJackStraw(seuratObj, dims = 1:50) # because `RunPCA` calculates 50x PCs by default (you can change this)
    fig.jackstraw <- function() {
      JackStrawPlot(seuratObj, dims = 1:50) +
        ggtitle("PCA JackStraw")
    }
    # the PC p-vals are in seuratObj@reductions$pca@jackstraw$overall.p.values;
    # find the last PC before the first non-significant one.
    # FIX: use the logical Score column directly (the original compared a
    # logical to the string "TRUE"), and keep all 50 PCs when every PC is
    # significant (the original indexing returned NA in that case)
    jscores <- as.data.frame(seuratObj@reductions$pca@jackstraw$overall.p.values > 0.05)
    first.nonsig <- which(jscores$Score)[1]  # TRUE means p > 0.05
    chosen.jack <- if (is.na(first.nonsig)) nrow(jscores) else first.nonsig - 1
    to.pdf(fig.jackstraw(), paste0(outdir, sampleID, "_PCA_jackstraw.pdf"))
  } else {
    print("skipping Jackstraw analysis")
  }
  # Run the figure functions and save graphs as PDFs
  to.pdf(fig.scree(), paste0(outdir, sampleID, "_scree.pdf"))
  to.pdf(fig.cumulativeVar(), paste0(outdir, sampleID, "_cumulativeVariance.pdf"))
  to.pdf(fig.elbow(), paste0(outdir, sampleID, "_PCA_elbow.pdf"))
  # return the Seurat object with the PCA (and optional JackStraw) attached
  return(seuratObj)
}
func_runNonLinearDR <- function(seuratObj, runTSNE = "TRUE") {
  # USAGE: seuratObj <- func_runNonLinearDR(seuratObj, runTSNE = TRUE or FALSE)
  # OUTPUT: a Seurat object with UMAP (and optionally tSNE) coordinates
  # NOTE: saves raw embedding scatter plots via the file-level `to.pdf`,
  #       `outdir` and `sampleID` objects.
  # FIX: accept logical TRUE/FALSE as well as the legacy "TRUE"/"FALSE"
  # strings, and fail fast on anything else (before the expensive UMAP run)
  if (isTRUE(runTSNE) || identical(runTSNE, "TRUE")) {
    do.tsne <- TRUE
  } else if (isFALSE(runTSNE) || identical(runTSNE, "FALSE")) {
    do.tsne <- FALSE
  } else {
    stop("runTSNE must be TRUE or FALSE")
  }
  # Run UMAP on the first 20 PCs
  # NOTE(review): tSNE below uses dims 1:50 -- confirm the 20/50 asymmetry is intended
  seuratObj <- Seurat::RunUMAP(seuratObj, dims = 1:20, n.neighbors = 5, min.dist = 0.1)
  fig.umap.raw <- function() {
    # manual ggplot alternative to DimPlot(seuratObj, reduction = "umap")
    Embeddings(seuratObj, reduction = "umap") %>%
      as.data.frame() %>%
      ggplot(aes(x = UMAP_1, y = UMAP_2)) +
      geom_point(size = 0.3) +
      theme_bw(base_size = 14) +
      ggtitle(paste0(sampleID, ": UMAP"))
  }
  to.pdf(fig.umap.raw(), paste0(outdir, sampleID, "_UMAP_raw.pdf"))
  # Run tSNE (optional; slow on large datasets)
  if (do.tsne) {
    seuratObj <- Seurat::RunTSNE(seuratObj, dims = 1:50)
    fig.tSNE.raw <- function() {
      # manual ggplot alternative to DimPlot(seuratObj, reduction = "tsne")
      Embeddings(seuratObj, reduction = "tsne") %>%
        as.data.frame() %>%
        ggplot(aes(x = tSNE_1, y = tSNE_2)) +
        geom_point(size = 0.3) +
        theme_bw(base_size = 14) +
        ggtitle(paste0(sampleID, ": tSNE"))
    }
    to.pdf(fig.tSNE.raw(), paste0(outdir, sampleID, "_tSNE_raw.pdf"))
  } else {
    print("skipping tSNE plot")
  }
  return(seuratObj)
}
# look at the number of features and cells
df
An object of class Seurat
36601 features across 2997 samples within 1 assay
Active assay: Spatial (36601 features, 0 variable features)
Now we will look at QC plots. The nFeature measure shows us the number of genes per spot, while nCount refers to the number of RNA transcripts (i.e. total counts) per spot. We can either visualise these measures on their own as violin plots, or we can plot them together on a scatter plot, where we expect the trend to be roughly diagonal. If we see outliers from this diagonal, they are indicative of weird spots.
# Look at some QC plots
# violin plots of genes (nFeature) and transcripts (nCount) per spot
VlnPlot(df, features = c("nFeature_Spatial", "nCount_Spatial"), group.by = "orig.ident")
ggsave(paste0(outdir, sampleID, "_countsAndFeatures.pdf"))
Saving 12 x 7.42 in image
# nFeature vs nCount scatter: expect a roughly diagonal trend; spots far off
# the diagonal are suspect
FeatureScatter(df, feature1 = "nFeature_Spatial", feature2 = "nCount_Spatial", group.by = "orig.ident") + NoLegend()
ggsave(paste0(outdir, sampleID, "_scatter.pdf"))
Saving 12 x 7.42 in image
Filter out spots with low counts and features (keeping only spots with more than 100 counts and more than 100 features per spot).
# keep only spots with strictly more than 100 transcripts AND more than 100 genes
df <- subset(df, subset = nCount_Spatial > 100 & nFeature_Spatial > 100)
df
An object of class Seurat
36601 features across 2969 samples within 1 assay
Active assay: Spatial (36601 features, 0 variable features)
Now we’ll look for spots with excessively high percentages of ribosomal or mitochondrial genes, which may further indicate a quality problem. We’re arbitrarily going to filter spots with >50% mitochondrial genes and/or >50% ribosomal genes. To see the “before filtering” plots, have a look at the accompanying plots directory.
# add percent.mt / percent.rb metadata and drop spots with >=50% mitochondrial
# or >=50% ribosomal content (the function's default thresholds)
df <- func_assessMT.RT(df, sampleID)
# this is what the data look like, post-filtering
SpatialFeaturePlot(df, features = "percent.mt") + theme(legend.position = "right")
ggsave(paste0(outdir, sampleID, "_percentMitochondriaSpatial_filtered.pdf"))
Saving 8 x 4.94 in image
SpatialFeaturePlot(df, features = "percent.rb") + theme(legend.position = "right")
ggsave(paste0(outdir, sampleID, "_percentRibosomeSpatial_filtered.pdf"))
Saving 8 x 4.94 in image
And just check in on how many genes/cells remain in our dataset:
df
An object of class Seurat
36601 features across 2969 samples within 1 assay
Active assay: Spatial (36601 features, 0 variable features)
Now we will do a cell cycle prediction. This method looks at certain marker genes associated with different phases of mitosis, and is described in a Seurat vignette. This prediction is typically used for single cell data, so it’s possible it won’t perform as well here with ST data. For more information about this analysis, see the Seurat vignette. However, unlike in the Seurat vignette, we aren’t going to include this data in any regression steps - we are just interested in seeing the trends across our tissue.
# predict cell-cycle phase per spot using Seurat's human cell-cycle gene set
df <- func_predictCellCycle(df, "human")
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
NULL
$rect
$rect$w
[1] 0.2917558
$rect$h
[1] 0.267907
$rect$left
[1] 0.9138256
$rect$top
[1] 1.08
$text
$text$x
[1] 1.054477 1.054477 1.054477
$text$y
[1] 1.0130233 0.9460465 0.8790698
Let’s visualise the results. We’ll grey out the G1-phase spots so we highlight those that are dividing.
# re-plot phases with custom colours, greying out G1 so dividing spots stand out
# (the "replacing scale" warning below is expected: we deliberately override
# SpatialDimPlot's default fill scale)
SpatialDimPlot(df, group.by = "Phase") + theme(legend.position = "right") + scale_fill_manual(values = c("grey", "#F4A698", "#DD614A", "black"))
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing
scale.
ggsave(paste0(outdir, sampleID, "_CellCycle_spatialPretty.pdf"))
Saving 10 x 10 in image
Here we diverge from the Seurat pipeline to run Scran normalisation instead.
# run scran normalisation (returns an SCE), then copy the normalised matrices
# back into the original Seurat object. We go via a temporary Seurat object
# rather than converting df directly so the spatial image data stored in df
# is preserved.
df.sce <- func_scranNorm(df)
df.temp <- as.Seurat(df.sce, counts = "counts", data = "ln.normcounts")
# NOTE(review): direct @assays slot assignment bypasses Seurat's accessors and
# assumes the cell/gene order is unchanged by the round-trip -- verify if
# upgrading Seurat
df@assays$Spatial@counts <- df.temp@assays$RNA@counts
df@assays$Spatial@data <- df.temp@assays$RNA@data
df <- func_ScaleData(df)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix
|
| | 0%
|
|================================================= | 50%
|
|==================================================================================================| 100%
# run PCA (50 PCs); skip the slow JackStraw permutation test
df <- func_runPCA(df, runJackstraw = "FALSE")
PC_ 1
Positive: IGKC, AD000090.1, MTRNR2L12, PIM3, IGSF6, IGLC7, QSER1, AKAP9, RALGPS2, LINC01783
POC1B-AS1, RAPGEF4, AC007406.5, ARMT1, ZNF254, TTC39B, USP49, SIVA1, ATPSCKMT, TENT4A
DPEP2, RAB30, Z83843.1, ZMYND10, TMEM138, ZNF720, SLC35B2, ADGRF4, PKDCC, ADCY5
Negative: COL1A1, HLA-B, TIMP1, DCN, HLA-A, SPARC, COL1A2, COL3A1, AC007952.4, B2M
IFI6, EFEMP1, CXCL10, ADH1B, IGFBP7, SERPINH1, MT-ATP6, CLU, MIF, MT-CO2
IGHG4, PDIA3, VCAN, C1QC, MT-ND4, RARRES1, PPIB, PRG4, IGHA1, ZFP36L1
PC_ 2
Positive: IGKC, HLA-B, B2M, IFI6, COL3A1, TIMP1, SCGB3A1, DCN, HLA-A, IGHG1
IGLC1, CLU, SERPINF1, IGHA1, HSD11B1, HBB, COL1A1, TNC, EEF1D, JCHAIN
METTL7A, IGHM, XIST, FHL1, IGHG4, ADAMTSL2, AC007952.4, SRGN, COMMD3, PRG4
Negative: AD000090.1, MALAT1, MT-ND2, MT-CYB, MT-ND4, MT-ND1, MT-ND3, MTRNR2L12, MT-ATP6, AL627171.2
MT-CO1, MT-ND5, MT-CO2, PABPC1, MTHFR, AASS, STRIP1, GJA4, RMRP, TRIB3
PHLDA2, MACF1, WDR48, UBXN7, MARCH7, SGPL1, UBE2V2, DHCR7, RBMS2, RBM39
PC_ 3
Positive: TGFB1I1, IGLC1, PIM3, IGLC2, SCGB3A1, LRRC8A, PTDSS1, DELE1, ATP11B, USP33
PSMB10, NOTCH2, SMIM15, IRF2, SLBP, PEMT, AQP3, MT-ND3, RBM39, RBMS3
IGKC, CPSF4, MYH10, TCP1, ARHGDIA, HERC1, PITRM1, SCO2, SCARA5, WLS
Negative: UBE2V2, FBN1, PAX8-AS1, POGLUT3, GALNS, TYROBP, ERAP2, CDH5, LPAR1, ABCA8
MRPL50, SEC31A, CSF3R, CRAT, TOR1B, DGKZ, MNDA, LMBR1, NDUFB2, AHDC1
VBP1, S1PR2, CSTF2T, LTV1, NCOR1, TBL2, XPNPEP1, C1QC, POGLUT1, UBE2E1
PC_ 4
Positive: LTBP1, DOCK2, ARL6IP4, GNAS, CLTC, CCL21, CDC27, SIN3B, HERPUD1, HVCN1
NCOR1, SERPINE2, UBXN1, PSMB8-AS1, ANKRD17, PPM1G, NCSTN, DRAM2, CMTM7, ZNF24
PDGFB, FAM3C, MNDA, GJA4, FAM20A, ANP32B, CHD9, TARS, DHCR7, RAP1A
Negative: CCDC92, SFRP2, NCOR2, NCEH1, EFEMP1, GLG1, RAP2A, PBDC1, LPAR1, ATP6V1C1
STEAP4, MMP1, RAB11A, WBP1, BAIAP2, XPNPEP1, SELENOS, REPS1, PRDX3, PITRM1
NDUFS8, TUBB2A, DLAT, SFSWAP, TRIM69, POLG, SACM1L, MZB1, HGH1, DDR2
PC_ 5
Positive: PDIA3, HGF, ENPEP, CKAP4, NDUFS4, ST13, HERC6, CXCL14, MTHFR, PEMT
NAPRT, SSBP3, PTPRK, NIPA2, EIF3I, TXLNA, FHL1, RAP1A, SFSWAP, LRRFIP1
MX2, RARRES1, TFRC, PDLIM3, TRIB1, WBP1, RNPS1, SLC44A4, DCTN6, EFR3A
Negative: CCDC92, PPM1G, BAIAP2, ELL2, TIE1, PLXNB1, BHLHE40, DDA1, MAPK7, ACSL1
PIM3, CCSER2, PRG4, CHST15, WIPI1, TXN2, NCEH1, PHLDA2, CS, ARMC6
TMEM150A, DDX41, ZNRF1, FAM160A2, CSTF2T, GLRX, NCLN, PCNA, DLAT, LARP1
[1] "The first 50 PCs explain 9% of the variance. 20% of the variance is explained by the first 50 PCs"
[1] "skipping Jackstraw analysis"
# elbow plot of the 50 PC standard deviations, saved as a jpeg for quick viewing
ElbowPlot(df, ndims = 50)
ggsave(paste0(outdir, sampleID, "_elbowplot.jpeg"))
Saving 7.29 x 4.51 in image
df <- func_runNonLinearDR(df, runTSNE = "TRUE") # UMAP uses PCs 1:20, tSNE uses PCs 1:50 (see function)
The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session13:15:05 UMAP embedding parameters a = 1.577 b = 0.8951
13:15:05 Read 2969 rows and found 20 numeric columns
13:15:05 Using Annoy for neighbor search, n_neighbors = 5
13:15:05 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:15:05 Writing NN index file to temp file /var/folders/8w/nx6ng4zj0rg143r293dq3mf40000gp/T//Rtmp7bVTx4/file69b6bf5c152
13:15:05 Searching Annoy index using 1 thread, search_k = 500
13:15:06 Annoy recall = 100%
13:15:06 Commencing smooth kNN distance calibration using 1 thread
13:15:08 Initializing from normalized Laplacian + noise
13:15:08 Commencing optimization for 500 epochs, with 16876 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:15:10 Optimization finished
# build the KNN/SNN graph on the first 20 PCs, ready for clustering
df <- FindNeighbors(df, reduction = "pca", dims = 1:20)
Computing nearest neighbor graph
Computing SNN
# sanity-check both embeddings, coloured by sample of origin
DimPlot(df, reduction = "umap", group.by = "orig.ident")
DimPlot(df, reduction = "tsne", group.by = "orig.ident")
Now we’re going to cluster the cells. First, we’ll make a temporary R object df.2 and test a range of different resolution values. The resolution parameter “control[s] the size and structure of communities that are formed by optimizing a generalized objective function”. Effectively, an increased resolution = more clusters - though you can’t tell Seurat to give you exactly N clusters, and often different resolution values will give the same number of clusters.
Look at the plots
# UMAP embeddings coloured by each candidate clustering resolution
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0") + ggtitle("res = 0")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0.1") + ggtitle("res = 0.1")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0.2") + ggtitle("res = 0.2")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0.3") + ggtitle("res = 0.3")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0.4") + ggtitle("res = 0.4")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0.6") + ggtitle("res = 0.6")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.0.8") + ggtitle("res = 0.8")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.1") + ggtitle("res = 1")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.1.2") + ggtitle("res = 1.2")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.1.4") + ggtitle("res = 1.4")
DimPlot(df.2, reduction = "umap", group.by = "Spatial_snn_res.1.6") + ggtitle("res = 1.6")
# the same clusterings projected onto the tissue sections
# (FIX: the res 1.4 / 1.6 plots now use the matching metadata columns --
# previously they re-plotted res 1 and res 1.2 under the wrong titles;
# stray trailing commas in three calls also removed)
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0") + ggtitle("res = 0")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0.1") + ggtitle("res = 0.1")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0.2") + ggtitle("res = 0.2")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0.3") + ggtitle("res = 0.3")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0.4") + ggtitle("res = 0.4")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0.6") + ggtitle("res = 0.6")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.0.8") + ggtitle("res = 0.8")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.1") + ggtitle("res = 1")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.1.2") + ggtitle("res = 1.2")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.1.4") + ggtitle("res = 1.4")
SpatialDimPlot(df.2, group.by = "Spatial_snn_res.1.6") + ggtitle("res = 1.6")
Now we want to choose a resolution value. One method to do this uses the R package Clustree. This is the R package description: “Deciding what resolution to use can be a difficult question when approaching a clustering analysis. One way to approach this problem is to look at how samples move as the number of clusters increases. This package allows you to produce clustering trees, a visualisation for interrogating clusterings as resolution increases.” It will generate a tree diagram showing how the different clusterings are inter-related. The clusters in this diagram will be coloured different shades of blue, representing “sc3 stability”. This is a “Stability index [that] shows how stable each cluster is across the selected range of k. The stability index varies between 0 and 1, where 1 means that the same cluster appears in every solution for different k”
# build the clustering tree across all tested resolutions, nodes coloured by SC3 stability
clust <- clustree(df.2, prefix = "Spatial_snn_res.", node_colour = "sc3_stability", edge_width = 1, node_text_colour = "white", node_label_size = 4, layout = "tree", edge_arrow = FALSE)
The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Please use the `.add` argument instead.
# display and save the clustering tree
clust
ggsave(plot = clust, file = paste0(outdir, sampleID, "_clustree.pdf"), width = 10, height = 10)
# extract the per-node stability values for the different resolutions from the
# clustree plot data
stability <- clust$data[,c("Spatial_snn_res.", "sc3_stability")]
write.table(stability, file = paste0(outdir, "clustree_stability.txt"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
# here we're going to work out which of the possible clustering resolutions is the most stable (i.e. gives the highest average sc3 stability score).
# keep only resolutions that produced more than one node in the tree
stability <- stability[stability$Spatial_snn_res. %in% names(which(table(stability$Spatial_snn_res.) > 1)), ]
# mean stability per resolution
stability.ave <- aggregate(as.numeric(stability$sc3_stability), list(stability$Spatial_snn_res.), mean)
rownames(stability.ave) <- stability.ave$Group.1
stability.ave$Group.1 <- NULL
# drop resolution 0 (a single trivially-stable cluster) before picking the best
stability.ave.no0 <- stability.ave[2:nrow(stability.ave), , drop = FALSE]
# bestres = resolution with the highest mean stability score
bestres <- as.numeric(rownames(stability.ave.no0)[which.max(stability.ave.no0$x)])
stability.ave
bestres
[1] 0.3
The value printed above is the resolution parameter that produced the highest average stability.
# drop the resolution-scan object and cluster at the most stable resolution
rm(df.2)
df <- FindClusters(df, resolution = bestres)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2969
Number of edges: 116001
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.7876
Number of communities: 3
Elapsed time: 0 seconds
# metadata column holding the chosen clustering
mycol <- paste0("Spatial_snn_res.", bestres)
DimPlot(df, group.by = mycol)
ggsave(paste0(outdir, sampleID, "UMAP_res", bestres, ".pdf"))
Saving 7.29 x 4.51 in image
SpatialDimPlot(df, group.by = mycol)
ggsave(paste0(outdir, sampleID, "spatial_res", bestres, ".pdf"))
Saving 7.29 x 4.51 in image
# set cluster identities and find positive marker genes per cluster
# (min.pct: gene detected in >=25% of the cluster; logfc.threshold: >=0.25 log2FC)
mycol <- paste0("Spatial_snn_res.", bestres)
Idents(df) <- mycol
markers <- FindAllMarkers(df, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
Calculating cluster 0
| | 0 % ~calculating
|+++++++++++++++++++++++++ | 50% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s
Calculating cluster 1
| | 0 % ~calculating
|+++++++++++++++++++++++++ | 50% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s
Calculating cluster 2
| | 0 % ~calculating
|++ | 3 % ~00s
|++++ | 7 % ~00s
|++++++ | 10% ~00s
|+++++++ | 14% ~00s
|+++++++++ | 17% ~00s
|+++++++++++ | 21% ~00s
|+++++++++++++ | 24% ~00s
|++++++++++++++ | 28% ~00s
|++++++++++++++++ | 31% ~00s
|++++++++++++++++++ | 34% ~00s
|+++++++++++++++++++ | 38% ~00s
|+++++++++++++++++++++ | 41% ~00s
|+++++++++++++++++++++++ | 45% ~00s
|+++++++++++++++++++++++++ | 48% ~00s
|++++++++++++++++++++++++++ | 52% ~00s
|++++++++++++++++++++++++++++ | 55% ~00s
|++++++++++++++++++++++++++++++ | 59% ~00s
|++++++++++++++++++++++++++++++++ | 62% ~00s
|+++++++++++++++++++++++++++++++++ | 66% ~00s
|+++++++++++++++++++++++++++++++++++ | 69% ~00s
|+++++++++++++++++++++++++++++++++++++ | 72% ~00s
|++++++++++++++++++++++++++++++++++++++ | 76% ~00s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~00s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~00s
|++++++++++++++++++++++++++++++++++++++++++++ | 86% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s
# top 10 markers per cluster by average log2 fold-change
# NOTE(review): top_n() is superseded by slice_max() in dplyr >= 1.0; keeps ties
top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
write.table(markers, file = paste0(outdir, "allmarkers.txt"), sep = "\t", quote = FALSE, col.names = NA)
write.table(top10, file = paste0(outdir, "top10markers.txt"), sep = "\t", quote = FALSE, col.names = NA)
# heatmap of the top markers (genes absent from scale.data are dropped with a message)
DoHeatmap(df, features = top10$gene) + NoLegend()
The following features were omitted as they were not found in the scale.data slot for the Spatial assay: SCGB1A1
ggsave(paste0(outdir, "top10_heatmap.pdf"))
Saving 7.29 x 4.51 in image
# tabulate spots per cluster and plot as a horizontal bar chart
# NOTE(review): the variable name `table` shadows base::table -- function
# calls still resolve to the base function, but consider renaming
table <- as.data.frame(table(df[[mycol]]))
ggplot(table, aes(x = Var1, y = Freq)) +
  geom_bar(stat = "identity") +
  coord_flip()
ggsave(file = paste0(outdir, "clustercounts.pdf"))
Saving 7.29 x 4.51 in image
# write the counts with cluster IDs as rownames
rownames(table) <- table$Var1
table$Var1 <- NULL
write.table(table, file = paste0(outdir, "clustercounts.txt"), sep = "\t", quote = FALSE, col.names = NA)
# persist the fully-annotated Seurat object for downstream analyses
saveRDS(df, file = paste0(outdir, sampleID, "_DF_annotated.RDS"))